# install.packages("datasets")  # not needed: 'datasets' ships with base R
# Inspect the data by printing state.x77
# Part (a): regress Income on Population, Area, Illiteracy and Murder, using
# the corresponding columns of the built-in state.x77 matrix.
Income <- state.x77[, "Income"]
Pop <- state.x77[, "Population"]
Area <- state.x77[, "Area"]
Illit <- state.x77[, "Illiteracy"]
Murder <- state.x77[, "Murder"]
fit <- lm(Income ~ Pop + Area + Illit + Murder)
summary(fit)
## 
## Call:
## lm(formula = Income ~ Pop + Area + Illit + Murder)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -795.8 -336.4 -105.5  316.6 1121.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.700e+03  1.700e+02  27.654  < 2e-16 ***
## Pop          4.020e-02  1.703e-02   2.361  0.02263 *  
## Area         3.032e-03  8.481e-04   3.575  0.00085 ***
## Illit       -4.009e+02  1.656e+02  -2.421  0.01957 *  
## Murder      -2.448e+01  2.969e+01  -0.824  0.41406    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 487.7 on 45 degrees of freedom
## Multiple R-squared:  0.4214, Adjusted R-squared:   0.37 
## F-statistic: 8.194 on 4 and 45 DF,  p-value: 4.715e-05
coefficients(fit) # least-squares estimate betahat
##   (Intercept)           Pop          Area         Illit        Murder 
##  4.700257e+03  4.019908e-02  3.032067e-03 -4.008758e+02 -2.447963e+01
RSS <- sum(residuals(fit)^2) # residual sum of squares
# S^2, the unbiased estimate of sigma^2: RSS divided by the residual degrees
# of freedom n - p. The model has p = 5 coefficients (intercept plus four
# slopes), so the divisor is 45, matching "Residual standard error: 487.7 on
# 45 degrees of freedom" above. Dividing by n - 4 would ignore the intercept.
sigmasquare <- RSS / df.residual(fit)

# Part (b): residual diagnostics for the full model `fit`.
r <- fit$residuals
# Residuals against Area, with the predictor on the x-axis and the residual on
# the y-axis so that any pattern in the residuals as a function of Area shows.
plot(Area, r, xlab = "Area", ylab = "Residuals")

## plot of chunk unnamed-chunk-1

hist(residuals(fit)) # histogram of the residuals

## plot of chunk unnamed-chunk-1

# The model contains an intercept, so the residuals average to zero up to
# floating-point rounding; both accessors return the same vector.
mean(r)
## [1] 1.054781e-15
mean(residuals(fit))
## [1] 1.054781e-15
# Part (c): refit with log(Area) in place of Area, then test whether logArea
# and Murder can be dropped together.
logArea <- log(Area)
fitc <- lm(Income ~ Pop + logArea + Illit + Murder) # refined (full) model
summary(fitc)
## 
## Call:
## lm(formula = Income ~ Pop + logArea + Illit + Murder)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -837.33 -364.14  -29.64  265.45 2225.23 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5136.49137  777.88348   6.603 3.96e-08 ***
## Pop            0.03484    0.01919   1.816   0.0761 .  
## logArea      -30.20394   73.93083  -0.409   0.6848    
## Illit       -504.10599  193.89257  -2.600   0.0126 *  
## Murder         8.56928   35.12602   0.244   0.8084    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 551.6 on 45 degrees of freedom
## Multiple R-squared:  0.2599, Adjusted R-squared:  0.1941 
## F-statistic:  3.95 on 4 and 45 DF,  p-value: 0.007855
RSSc <- sum(resid(fitc)^2) # residual sum of squares of the full model
# Null hypothesis for the F-test below: the coefficients of Murder and
# logArea are both zero.


# Reduced model under that hypothesis: only Pop and Illit remain.
fitc2 <- lm(Income ~ Pop + Illit)
summary(fitc2)
## 
## Call:
## lm(formula = Income ~ Pop + Illit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -848.72 -349.42  -60.84  294.78 2171.82 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4833.16224  176.61055  27.366  < 2e-16 ***
## Pop            0.03555    0.01741   2.042 0.046780 *  
## Illit       -468.63466  127.49422  -3.676 0.000608 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 540.8 on 47 degrees of freedom
## Multiple R-squared:  0.257,  Adjusted R-squared:  0.2253 
## F-statistic: 8.127 on 2 and 47 DF,  p-value: 0.0009307
RSSc2 <- sum(resid(fitc2)^2) # residual sum of squares of the reduced model
RSSc2
## [1] 13747064
anova(fitc2)
## Analysis of Variance Table
## 
## Response: Income
##           Df   Sum Sq Mean Sq F value    Pr(>F)    
## Pop        1   802184  802184  2.7426 0.1043685    
## Illit      1  3951845 3951845 13.5110 0.0006075 ***
## Residuals 47 13747064  292491                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Partial F statistic ((RSS_reduced - RSS_full) / q) / (RSS_full / df_full),
# with q = 2 dropped coefficients and df_full the residual degrees of freedom
# of the full model (45).
fc <- (RSSc2 - RSSc) / RSSc * (df.residual(fitc) / 2)
# Despite its name, fc_value holds the upper-tail p-value of that statistic.
fc_value <- 1 - pf(fc, 2, df.residual(fitc))
fc_value
## [1] 0.9160031
# Part (d): add a quadratic Illiteracy term to Income ~ Pop + Illit and test
# whether it is needed.
Illit2 <- Illit^2
refit3 <- lm(Income ~ Pop + Illit + Illit2)
summary(refit3)
## 
## Call:
## lm(formula = Income ~ Pop + Illit + Illit2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -864.42 -363.82   18.94  231.73 1915.90 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4091.32194  376.30857  10.872 2.66e-14 ***
## Pop            0.02364    0.01758   1.345   0.1852    
## Illit        931.78009  645.73570   1.443   0.1558    
## Illit2      -488.22974  221.03369  -2.209   0.0322 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 519.8 on 46 degrees of freedom
## Multiple R-squared:  0.3282, Adjusted R-squared:  0.2844 
## F-statistic: 7.491 on 3 and 46 DF,  p-value: 0.0003485
anova(refit3)
## Analysis of Variance Table
## 
## Response: Income
##           Df   Sum Sq Mean Sq F value    Pr(>F)    
## Pop        1   802184  802184  2.9689 0.0915941 .  
## Illit      1  3951845 3951845 14.6261 0.0003929 ***
## Illit2     1  1318265 1318265  4.8790 0.0322058 *  
## Residuals 46 12428798  270191                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
RSS4 <- sum(resid(refit3)^2) # residual sum of squares with the quadratic term
RSS4
## [1] 12428798
# Partial F statistic for the single added coefficient (q = 1): RSSc2 is the
# RSS of the model without Illit2, and the denominator uses the residual
# degrees of freedom of refit3 (46).
f_2 <- ((RSSc2 - RSS4) / RSS4) * (df.residual(refit3) / 1)
# Despite its name, f_value_2 holds the p-value. It is below 0.05, so the null
# hypothesis that the Illit2 coefficient is zero is rejected: Illit2 is
# significant.
f_value_2 <- 1 - pf(f_2, 1, df.residual(refit3))
f_value_2
## [1] 0.03220576
# Part (e): F-test of a joint restriction on the model Income ~ Pop + Illit.
# Moving 0.05 * Pop and -500 * Illit to the left-hand side leaves an
# intercept-only model for Z; this corresponds to the null hypothesis
# beta_Pop = 0.05 and beta_Illit = -500.
Z <- Income - 0.05 * Pop + 500 * Illit
fit5 <- lm(Z ~ 1)
summary(fit5)
## 
## Call:
## lm(formula = Z ~ 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -817.38 -374.70  -93.48  233.46 2238.27 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4808.48      75.47   63.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 533.7 on 49 degrees of freedom
anova(fit5)
## Analysis of Variance Table
## 
## Response: Z
##           Df   Sum Sq Mean Sq F value Pr(>F)
## Residuals 49 13956021  284817
RSS5 <- sum(resid(fit5)^2) # residual sum of squares of the restricted model
RSS5
## [1] 13956021
# F statistic comparing the restricted fit (RSS5) with the unrestricted model
# fitc2 (RSSc2): q = 2 restrictions, and the denominator uses the residual
# degrees of freedom of fitc2 (47).
f_value_e <- (RSS5 - RSSc2) / RSSc2 * (df.residual(fitc2) / 2)
p_value_e <- 1 - pf(f_value_e, 2, df.residual(fitc2)) # upper-tail p-value
print(p_value_e)
## [1] 0.701513